#include "func.h"


/**
 * Element-wise addition of two byte vectors: C[t] = A[t] + B[t].
 *
 * Each thread handles the single element selected by threadIdx.x. blockIdx is
 * not consulted and there is no bounds check, so the launch must use no more
 * threads in x than the arrays have elements. The sum is stored back into a
 * uint8_t, so results above 255 wrap modulo 256.
 *
 * @param A  first operand (read), must be valid in device code
 * @param B  second operand (read), must be valid in device code
 * @param C  destination (written), must be valid in device code
 */
__global__ void VecAddDim1(uint8_t *A, uint8_t *B ,uint8_t *C)
{
    const unsigned int idx = threadIdx.x;
    const uint8_t lhs = A[idx];
    const uint8_t rhs = B[idx];
    C[idx] = lhs + rhs;
}

/**
 * Element-wise addition of two byte matrices addressed through row-pointer
 * tables: C[x][y] = A[x][y] + B[x][y].
 *
 * The first index comes from threadIdx.x and the second from threadIdx.y.
 * blockIdx is not consulted and there is no bounds check, so the block shape
 * must not exceed the matrix shape. The kernel dereferences both the tables
 * and the row pointers stored in them, so all of them must be valid in
 * device code. The sum is stored as uint8_t and therefore wraps modulo 256.
 *
 * @param A  table of row pointers for the first operand (read)
 * @param B  table of row pointers for the second operand (read)
 * @param C  table of row pointers for the destination (rows are written)
 */
__global__ void VecAddDim2(uint8_t **A, uint8_t **B ,uint8_t **C)
{
    const unsigned int row = threadIdx.x;
    const unsigned int col = threadIdx.y;

    const uint8_t *rowA = A[row];
    const uint8_t *rowB = B[row];
    uint8_t *rowC = C[row];

    rowC[col] = rowA[col] + rowB[col];
}

/**
 * Element-wise addition of two byte volumes addressed through two levels of
 * pointer tables: C[x][y][z] = A[x][y][z] + B[x][y][z].
 *
 * The three indices come from threadIdx.x, threadIdx.y and threadIdx.z in
 * that order. blockIdx is not consulted and there is no bounds check, so the
 * block shape must not exceed the volume shape. Every level of indirection is
 * dereferenced inside the kernel, so every table and data pointer must be
 * valid in device code. The sum is stored as uint8_t and wraps modulo 256.
 *
 * @param A  top-level pointer table for the first operand (read)
 * @param B  top-level pointer table for the second operand (read)
 * @param C  top-level pointer table for the destination (data is written)
 */
__global__ void VecAddDim3(uint8_t ***A, uint8_t ***B ,uint8_t ***C)
{
    const unsigned int x = threadIdx.x;
    const unsigned int y = threadIdx.y;
    const unsigned int z = threadIdx.z;

    // Walk the two pointer levels once per operand, then add the bytes.
    const uint8_t *lineA = A[x][y];
    const uint8_t *lineB = B[x][y];
    uint8_t *lineC = C[x][y];

    lineC[z] = lineA[z] + lineB[z];
}
// Demo:
// 1-D vector addition.
// Tips: the host's input data must be copied to device memory before the
// kernel runs, and the CUDA result must be copied back to host memory.
void test_func_dim1(void)
{
    // Flow: fill two 3-byte host vectors with 0,1,2, upload both to the
    // device, run VecAddDim1 with one block of 3 threads, download the sums
    // and print them under a "dim1:" heading. On any CUDA failure an error
    // is written to stderr and the results are not printed.
    const int n = 3;                            // element count == threads launched
    const size_t bytes = n*sizeof(uint8_t);

    uint8_t *A = (uint8_t*)malloc(bytes);
    uint8_t *B = (uint8_t*)malloc(bytes);
    uint8_t *C = (uint8_t*)malloc(bytes);
    if(!A || !B || !C){
        fprintf(stderr,"test_func_dim1: host allocation failed\n");
        free(A);
        free(B);
        free(C);
        return;
    }
    for(int i=0;i<n;i++){
        A[i] = (uint8_t)i;
        B[i] = (uint8_t)i;
    }

    // NULL-initialised so the cudaFree calls at the end are valid even when
    // an allocation was skipped or failed.
    uint8_t *d_a = NULL,*d_b = NULL,*d_c = NULL;

    // The && chain stops at the first failing call. Note that B is uploaded
    // into d_b: the previous version copied A into d_a twice, leaving d_b
    // uninitialised when the kernel read it.
    bool ok = cudaMalloc((void**)&d_a,bytes) == cudaSuccess
           && cudaMalloc((void**)&d_b,bytes) == cudaSuccess
           && cudaMalloc((void**)&d_c,bytes) == cudaSuccess
           && cudaMemcpy(d_a,A,bytes,cudaMemcpyHostToDevice) == cudaSuccess
           && cudaMemcpy(d_b,B,bytes,cudaMemcpyHostToDevice) == cudaSuccess;

    if(ok){
        VecAddDim1<<<1,n>>>(d_a,d_b,d_c);
        // Peek (do not clear) so the error can still be reported below; the
        // blocking copy-back also surfaces faults raised while the kernel ran.
        ok = cudaPeekAtLastError() == cudaSuccess
          && cudaMemcpy(C,d_c,bytes,cudaMemcpyDeviceToHost) == cudaSuccess;
    }

    if(ok){
        printf("\ndim1:\n");
        for(int i=0;i<n;i++){
            printf("%d ",C[i]);
        }
    }else{
        fprintf(stderr,"test_func_dim1: CUDA error: %s\n",
                cudaGetErrorString(cudaGetLastError()));
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(A);
    free(B);
    free(C);
}

/**
 * Demo: 3x3 byte-matrix addition on the GPU using A[i][j]-style indexing.
 *
 * Each matrix is one contiguous data buffer plus a table of row pointers.
 * Both the data and the tables are uploaded, VecAddDim2 is launched with a
 * single 3x3 block, and the result is downloaded and printed under a "dim2:"
 * heading. On any CUDA failure an error is written to stderr and the results
 * are not printed. All host and device allocations are released before
 * returning.
 */
void test_func_dim2(void)
{
    const int rows = 3;
    const int cols = 3;
    const size_t tableBytes = rows*sizeof(uint8_t*);
    const size_t dataBytes = rows*cols*sizeof(uint8_t);

    // Host staging buffers: A/B/C are row-pointer tables, data_* hold the
    // matrix elements in row-major order.
    uint8_t **A = (uint8_t **)malloc(tableBytes);
    uint8_t **B = (uint8_t **)malloc(tableBytes);
    uint8_t **C = (uint8_t **)malloc(tableBytes);
    uint8_t *data_a = (uint8_t *)malloc(dataBytes);
    uint8_t *data_b = (uint8_t *)malloc(dataBytes);
    uint8_t *data_c = (uint8_t *)malloc(dataBytes);
    if(!A || !B || !C || !data_a || !data_b || !data_c){
        fprintf(stderr,"test_func_dim2: host allocation failed\n");
        free(A);
        free(B);
        free(C);
        free(data_a);
        free(data_b);
        free(data_c);
        return;
    }
    for(int i=0;i<rows*cols;i++){
        data_a[i] = (uint8_t)i;
        data_b[i] = (uint8_t)i;
        data_c[i] = 0;
    }

    // NULL-initialised so the cudaFree calls at the end are valid even when
    // an allocation was skipped or failed.
    uint8_t **d_a = NULL;
    uint8_t **d_b = NULL;
    uint8_t **d_c = NULL;
    uint8_t *d_data_a = NULL;
    uint8_t *d_data_b = NULL;
    uint8_t *d_data_c = NULL;

    // The && chain stops at the first failing call.
    bool ok = cudaMalloc((void**)&d_a,tableBytes) == cudaSuccess
           && cudaMalloc((void**)&d_b,tableBytes) == cudaSuccess
           && cudaMalloc((void**)&d_c,tableBytes) == cudaSuccess
           && cudaMalloc((void**)&d_data_a,dataBytes) == cudaSuccess
           && cudaMalloc((void**)&d_data_b,dataBytes) == cudaSuccess
           && cudaMalloc((void**)&d_data_c,dataBytes) == cudaSuccess;

    if(ok){
        // Why the tables are filled with d_data_* (device) addresses rather
        // than data_* (host) addresses:
        //
        // A is an array of row pointers; A[i] holds the address of the first
        // element of row i. Pointing A[i] at data_a would map A onto HOST
        // memory, which lets the CPU use A[i][j] but is useless to the GPU.
        // We want the kernel to index d_a[i][j] on the GPU, so the same
        // mapping has to exist in DEVICE memory; otherwise the kernel could
        // only take the flat buffer and compute offsets by hand.
        //
        // d_data_a is the base of the device data buffer. The table is built
        // here on the host with device row addresses and then cudaMemcpy'd
        // into d_a, which effectively maps d_a onto d_data_a.
        //
        // Writing d_a[i] = d_data_a + i*cols directly on the host is not
        // possible: the CPU cannot access GPU memory, so it cannot store
        // into d_a[i].
        for(int i=0;i<rows;i++){
            A[i] = d_data_a + i*cols;
            B[i] = d_data_b + i*cols;
            C[i] = d_data_c + i*cols;
        }
        ok = cudaMemcpy(d_a,A,tableBytes,cudaMemcpyHostToDevice) == cudaSuccess
          && cudaMemcpy(d_b,B,tableBytes,cudaMemcpyHostToDevice) == cudaSuccess
          && cudaMemcpy(d_c,C,tableBytes,cudaMemcpyHostToDevice) == cudaSuccess
          && cudaMemcpy(d_data_a,data_a,dataBytes,cudaMemcpyHostToDevice) == cudaSuccess
          && cudaMemcpy(d_data_b,data_b,dataBytes,cudaMemcpyHostToDevice) == cudaSuccess
          && cudaMemcpy(d_data_c,data_c,dataBytes,cudaMemcpyHostToDevice) == cudaSuccess;
    }

    if(ok){
        dim3 block(rows,cols);
        dim3 grid(1,1);
        VecAddDim2<<<grid,block>>>(d_a,d_b,d_c);
        // Only the element data is downloaded. The device row table is not
        // copied back: it holds device addresses the host cannot dereference.
        // Peek (do not clear) so the error can still be reported below.
        ok = cudaPeekAtLastError() == cudaSuccess
          && cudaMemcpy(data_c,d_data_c,dataBytes,cudaMemcpyDeviceToHost) == cudaSuccess;
    }

    if(ok){
        // After the download the host table must be rebuilt over the host
        // buffer; until then C and data_c have no mapping between them.
        for(int i=0;i<rows;i++){
            C[i] = data_c + i*cols;
        }

        printf("\ndim2:\n");
        for(int i=0;i<rows;i++){
            for(int j=0;j<cols;j++){
                printf("%d ",C[i][j]);
            }
            printf("\n");
        }
    }else{
        fprintf(stderr,"test_func_dim2: CUDA error: %s\n",
                cudaGetErrorString(cudaGetLastError()));
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFree(d_data_a);
    cudaFree(d_data_b);
    cudaFree(d_data_c);     // was leaked by the previous version
    free(A);
    free(B);
    free(C);
    free(data_a);
    free(data_b);
    free(data_c);
}

/**
 * Demo: 3x3x3 byte-volume addition on the GPU using A[i][j][k]-style indexing.
 *
 * Each volume is one contiguous data buffer, a 9-entry table of pointers into
 * that buffer (one per [i][j] line), and a 3-entry table of pointers into the
 * 9-entry table. All three levels are uploaded with device addresses,
 * VecAddDim3 is launched with a single 3x3x3 block, and the result data is
 * downloaded and printed under a "dim3:" heading. On any CUDA failure an
 * error is written to stderr and the results are not printed. All host and
 * device allocations are released before returning.
 */
void test_func_dim3(void)
{
    const int n = 3;                                    // extent of every dimension
    const size_t topBytes = n*sizeof(uint8_t**);
    const size_t midBytes = n*n*sizeof(uint8_t*);
    const size_t dataBytes = n*n*n*sizeof(uint8_t);

    // Host staging buffers for the three levels of each volume.
    uint8_t ***A = (uint8_t***)malloc(topBytes);
    uint8_t ***B = (uint8_t***)malloc(topBytes);
    uint8_t ***C = (uint8_t***)malloc(topBytes);
    uint8_t **indexA = (uint8_t**)malloc(midBytes);
    uint8_t **indexB = (uint8_t**)malloc(midBytes);
    uint8_t **indexC = (uint8_t**)malloc(midBytes);
    uint8_t *dataA = (uint8_t*)malloc(dataBytes);
    uint8_t *dataB = (uint8_t*)malloc(dataBytes);
    uint8_t *dataC = (uint8_t*)malloc(dataBytes);
    if(!A || !B || !C || !indexA || !indexB || !indexC || !dataA || !dataB || !dataC){
        fprintf(stderr,"test_func_dim3: host allocation failed\n");
        free(A);
        free(B);
        free(C);
        free(indexA);
        free(indexB);
        free(indexC);
        free(dataA);
        free(dataB);
        free(dataC);
        return;
    }

    for(int i=0;i<n*n*n;i++){
        dataA[i] = (uint8_t)i;
        dataB[i] = (uint8_t)i;
        dataC[i] = 0;
    }

    // NULL-initialised so the cudaFree calls at the end are valid even when
    // an allocation was skipped or failed.
    uint8_t ***d_A = NULL;
    uint8_t ***d_B = NULL;
    uint8_t ***d_C = NULL;
    uint8_t **d_indexA = NULL;
    uint8_t **d_indexB = NULL;
    uint8_t **d_indexC = NULL;
    uint8_t *d_dataA = NULL;
    uint8_t *d_dataB = NULL;
    uint8_t *d_dataC = NULL;

    // The && chain stops at the first failing call.
    bool ok = cudaMalloc((void**)&d_A,topBytes) == cudaSuccess
           && cudaMalloc((void**)&d_B,topBytes) == cudaSuccess
           && cudaMalloc((void**)&d_C,topBytes) == cudaSuccess
           && cudaMalloc((void**)&d_indexA,midBytes) == cudaSuccess
           && cudaMalloc((void**)&d_indexB,midBytes) == cudaSuccess
           && cudaMalloc((void**)&d_indexC,midBytes) == cudaSuccess
           && cudaMalloc((void**)&d_dataA,dataBytes) == cudaSuccess
           && cudaMalloc((void**)&d_dataB,dataBytes) == cudaSuccess
           && cudaMalloc((void**)&d_dataC,dataBytes) == cudaSuccess;

    if(ok){
        // Build both table levels on the host using DEVICE addresses, so that
        // once uploaded the kernel can follow d_A[i][j][k] entirely in device
        // memory. These host tables must not be dereferenced on the host
        // while they hold device addresses.
        for(int i=0;i<n;i++){
            A[i] = d_indexA + n*i;
            B[i] = d_indexB + n*i;
            C[i] = d_indexC + n*i;
        }
        for(int i=0;i<n*n;i++){
            indexA[i] = d_dataA + n*i;
            indexB[i] = d_dataB + n*i;
            indexC[i] = d_dataC + n*i;
        }
        ok = cudaMemcpy(d_A,A,topBytes,cudaMemcpyHostToDevice) == cudaSuccess
          && cudaMemcpy(d_B,B,topBytes,cudaMemcpyHostToDevice) == cudaSuccess
          && cudaMemcpy(d_C,C,topBytes,cudaMemcpyHostToDevice) == cudaSuccess
          && cudaMemcpy(d_indexA,indexA,midBytes,cudaMemcpyHostToDevice) == cudaSuccess
          && cudaMemcpy(d_indexB,indexB,midBytes,cudaMemcpyHostToDevice) == cudaSuccess
          && cudaMemcpy(d_indexC,indexC,midBytes,cudaMemcpyHostToDevice) == cudaSuccess
          && cudaMemcpy(d_dataA,dataA,dataBytes,cudaMemcpyHostToDevice) == cudaSuccess
          && cudaMemcpy(d_dataB,dataB,dataBytes,cudaMemcpyHostToDevice) == cudaSuccess
          && cudaMemcpy(d_dataC,dataC,dataBytes,cudaMemcpyHostToDevice) == cudaSuccess;
    }

    if(ok){
        dim3 block(n,n,n);
        dim3 grid(1,1,1);
        VecAddDim3<<<grid,block>>>(d_A,d_B,d_C);
        // Only the element data is downloaded. The device pointer tables are
        // not copied back: they hold device addresses and the host tables are
        // rebuilt below anyway. Peek (do not clear) so the error can still be
        // reported further down.
        ok = cudaPeekAtLastError() == cudaSuccess
          && cudaMemcpy(dataC,d_dataC,dataBytes,cudaMemcpyDeviceToHost) == cudaSuccess;
    }

    if(ok){
        // Re-point the host tables at the host buffers so C[i][j][k] is
        // readable on the CPU.
        for(int i=0;i<n;i++){
            C[i] = indexC + n*i;
        }
        for(int i=0;i<n*n;i++){
            indexC[i] = dataC + n*i;
        }

        printf("\ndim3:\n");
        for(int i=0;i<n;i++){
            for(int j=0;j<n;j++){
                printf("(%d %d %d)",C[0][i][j],C[1][i][j],C[2][i][j]);
            }
            printf("\n");
        }
    }else{
        fprintf(stderr,"test_func_dim3: CUDA error: %s\n",
                cudaGetErrorString(cudaGetLastError()));
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFree(d_indexA);
    cudaFree(d_indexB);
    cudaFree(d_indexC);
    cudaFree(d_dataA);
    cudaFree(d_dataB);
    cudaFree(d_dataC);
    // Host buffers were leaked by the previous version.
    free(A);
    free(B);
    free(C);
    free(indexA);
    free(indexB);
    free(indexC);
    free(dataA);
    free(dataB);
    free(dataC);
}
